Set up

using <- function(...) {
    libs <- unlist(list(...))
    need <- libs <- libs[!unlist(lapply(libs, require, character.only = T))]
    if(length(need) > 0){ 
        install.packages(need, repos = "https://cloud.r-project.org")
        need <- need[!unlist(lapply(need, require, character.only = T))]
        if (length(need) > 0) {
          if (!requireNamespace("BiocManager", quietly = T))
            install.packages("BiocManager")
          BiocManager::install(need)
        }
    }
    lapply(libs, require, character.only = T)
}
using("tidyverse", "DESeq2", "CAGEfightR", "gprofiler2", "pheatmap",
      "org.Hs.eg.db", "TxDb.Hsapiens.UCSC.hg38.knownGene",
      "BSgenome.Hsapiens.UCSC.hg38", "ggseqlogo", "patchwork")

Import

# list of samples and bigWig file names
ds <- list.files(pattern = '.bw') %>%
  tibble(f = .) %>%
  separate(f, c('Name', 'sign', NA), '\\.', remove = F) %>%
  mutate(sign = ifelse(sign == 'plus', 'BigWigPlus', 'BigWigMinus')) %>%
  pivot_wider(names_from = "sign", values_from = "f") %>%
  mutate(nm = Name) %>%
  column_to_rownames("nm") %>%
  DataFrame()

# rearrange
bws <- list(plus = ds$BigWigPlus,
            minus = ds$BigWigMinus) %>%
  lapply(function(x) {
    BigWigFileList(x) %>%
      `names<-`(ds$Name)
  })

# genome
gn <- Seqinfo(genome = 'hg38')[c(paste0('chr', 1:22), 'chrX', 'chrY')]

# read in files
ctss <- quantifyCTSSs(plusStrand = bws$plus,
                      minusStrand = bws$minus,
                      design = ds,
                      genome = gn)
## Checking design...
## Checking supplied genome compatibility...
## Iterating over 1 genomic tiles in 54 samples using 10 worker(s)...
## Importing CTSSs from plus strand...
## Registered S3 method overwritten by 'pryr':
##   method      from
##   print.bytes Rcpp
## Importing CTSSs from minus strand...
## Merging strands...
## Formatting output...
## ### CTSS summary ###
## Number of samples: 54
## Number of CTSSs: 27.73 millions
## Sparsity: 94.81 %
## Type of rowRanges: StitchedGPos
## Final object size: 1.12 GB

Filter

As shown above, there are ~28M individual CTSSs, lots of which are noise. Going forwards we’ll only consider those that have any reads in more than one sample.

ctss <- calcTPM(ctss) %>%
  calcPooled(inputAssay = "TPM") %>%
  calcSupport(inputAssay = "counts",
              outputColumn = "support",
              unexpressed = 0)
sctss <- subset(ctss, support > 1) %>% calcTPM()
tibble(x = rowRanges(ctss)$support) %>%
  dplyr::count(x) %>%
  ggplot(aes(x = x, y = n)) +
  geom_vline(xintercept = 1.5, color = 'red') +
  geom_col(aes(fill = x), show.legend = F) +
  scale_y_log10(expand = expansion(c(0,.1)),
                breaks = 10^(c(2,4,6)),
                labels = parse(text = sprintf('10^{%d}', c(2,4,6)))) +
  labs(x = "# of samples with >0 reads", y = "Frequency")


Clustering

Promoters (unidirectional)

To call promoters, we’ll pool the signal across all samples to improve power as previously mentioned. Additionally, we’ll only consider CTSSs with pooled TPM >0.1 and aggregate ones within 20bp.

tcs <- clusterUnidirectionally(sctss,
                               pooledCutoff = 0.1,
                               mergeDist = 20)
## Splitting by strand...
## Slice-reduce to find clusters...
## Calculating statistics...
## Preparing output...
## Tag clustering summary:
##   Width   Count Percent
##   Total 2232638 1e+02 %
##     >=1 1863030 8e+01 %
##    >=10  327319 1e+01 %
##   >=100   42251 2e+00 %
##  >=1000      38 2e-03 %

Enhancers (bidirectional)

On the other hand, enhancers are characterized by weak bidirectional transcription of capped eRNAs, a feature that can be exploited for their identification. Specifically, 0.95 was chosen as the threshold on the measure of balanced bidirectionality (Bhattacharyya coefficient: ranges between 0 and 1). Given that a rather stringent threshold of balanced transcription is imposed, we use the original unfiltered set of CTSSs here. Subsequently, we require that the candidate enhancers to demonstrate significant bidirectionality in more than 1 samples.

bcs <- clusterBidirectionally(ctss, balanceThreshold = 0.95) %>%
  calcBidirectionality(samples = ctss)
## Pre-filtering bidirectional candidate regions...
## Retaining for analysis: 82%
## Splitting by strand...
## Calculating windowed coverage on plus strand...
## Calculating windowed coverage on minus strand...
## Calculating balance score...
## Slice-reduce to find bidirectional clusters...
## Calculating statistics...
## Preparing output...
## # Bidirectional clustering summary:
## Number of bidirectional clusters: 814055
## Maximum balance score: 1
## Minimum balance score: 0.950000014008683
## Maximum width: 2593
## Minimum width: 401
enhs <- subset(bcs, bidirectionality > 1)
tibble(x = bcs$bidirectionality) %>%
  dplyr::count(x) %>%
  ggplot(aes(x = x, y = n)) +
  geom_vline(xintercept = 1.5, color = 'red') +
  geom_col(aes(fill = x), show.legend = F) +
  scale_y_log10(expand = expansion(c(0,.1)),
                breaks = 10^(c(1,3,5)),
                labels = parse(text = sprintf('10^{%d}', c(1,3,5)))) +
  labs(x = "# samples demonstrating bidirectionality", y = "Frequency")


Cluster-based filter

Previous filters were at a CTSS-level, now we enforce cluster-level filtering by requiring that a cluster must have >1 sample showing >1 TPM.

tcs <- quantifyClusters(ctss,
                        clusters = tcs,
                        inputAssay = "counts") %>%
  calcTPM(totalTags = "totalTags") %>%
  calcSupport(inputAssay = "TPM",
              outputColumn = "support",
              unexpressed = 1)

enhs <- quantifyClusters(ctss,
                         clusters = enhs,
                         inputAssay = "counts") %>%
  calcTPM(totalTags = "totalTags") %>%
  calcSupport(inputAssay = "TPM",
              outputColumn = "support",
              unexpressed = 1)
tibble(x = rowRanges(tcs)$support) %>%
  mutate(ttl = sprintf('Unidirectional: %d passes', sum(x > 1))) %>%
  dplyr::count(ttl, x) %>%
  ggplot(aes(x = x, y = n)) +
  geom_vline(xintercept = 1.5, color = 'red') +
  geom_col(aes(fill = x), show.legend = F) +
  facet_grid(. ~ ttl, scales = "free_y") +
  scale_y_log10(expand = expansion(c(0,.1)),
                breaks = 10^(c(2,4,6)),
                labels = parse(text = sprintf('10^{%d}', c(2,4,6)))) +
  labs(x = "# of samples with >1 TPM", y = "Frequency") 

`

  1. How many enhancers pass the same filter? What do you think contributed to this discrepancy?

`

So let’s proceed to take those supported by at least 2 samples

tcs2 <- subset(tcs, support > 1)
enhs2 <- subset(enhs, support > 1)

Shape

We can further assess the “shape” of TSSs based on the distribution of signals: i.e., is there a single dominant peak, or multiple? The interquantile range of signals is one way to assess this pattern, and we only focus on highly expressed clusters for the sake of simplicity.

hi <- tcs2[rowMeans(assay(tcs2, 'TPM')) > 10] %>%
  calcShape(pooled = sctss,
            outputColumn = "IQR",
            shapeFunction = shapeIQR,
            lower = 0.05, upper = 0.95)
rowData(hi)$shape <- ifelse(rowData(hi)$IQR < 10, 'Sharp', 'Broad')

rowData(hi)[,c('IQR', 'shape')] %>% 
  as_tibble() %>% 
  group_by(shape) %>%
  mutate(kind = sprintf('%s: %d', shape, n())) %>%
  ungroup() %>%
  ggplot(aes(x = IQR)) +
  geom_histogram(aes(fill = kind), binwidth = 1) +
  geom_vline(xintercept = 9.5, color = 'red', linetype = 'dashed') +
  scale_fill_manual(values = c('#3976AF', '#F08536')) +
  coord_cartesian(xlim = c(0,100)) +
  scale_y_continuous(expand = expansion(c(0,.1))) +
  labs(x = "IQR (bp)", y = "Frequency")

Are these two classes different at a sequence level? We’ll consider the most enriched peak as the putative TSS in each cluster

pmts <- rowRanges(hi) %>% swapRanges() # get the peak instead of the entire cluster

`

  1. Recall the previous exercises from lecture 3 and compare the consensus sequence for the two different shape classes. Use BSgenome.Hsapiens.UCSC.hg38 instead of hg19

`


Genomic annotation

Unidirectional clusters

tcs2 <- assignTxType(tcs2,
                     txModels = TxDb.Hsapiens.UCSC.hg38.knownGene,
                     outputColumn = "peakTxType",
                     swap = "thick")
## Finding hierachical overlaps with swapped ranges...
## ### Overlap summary: ###
##       txType count percentage
## 1   promoter 22971       38.2
## 2   proximal  5611        9.3
## 3    fiveUTR  1257        2.1
## 4   threeUTR  3882        6.5
## 5        CDS  4945        8.2
## 6       exon   879        1.5
## 7     intron  6285       10.4
## 8  antisense  7748       12.9
## 9 intergenic  6577       10.9

Bidirectional clusters

enhs2 <- assignTxType(enhs2,
                      txModels = TxDb.Hsapiens.UCSC.hg38.knownGene,
                      outputColumn = "peakTxType",
                      swap = "thick")
## Finding hierachical overlaps with swapped ranges...
## ### Overlap summary: ###
##       txType count percentage
## 1   promoter  1192       40.3
## 2   proximal   862       29.1
## 3    fiveUTR    15        0.5
## 4   threeUTR    39        1.3
## 5        CDS    24        0.8
## 6       exon    42        1.4
## 7     intron   474       16.0
## 8  antisense     0        0.0
## 9 intergenic   310       10.5

Differential expression

Assignment to genes

From the previous transcript-based annotation we can easily apply aggregation to obtain gene-level groupings

tcs2 <- assignGeneID(tcs2,
                     geneModels = TxDb.Hsapiens.UCSC.hg38.knownGene,
                     outputColumn = "geneID")
## Extracting genes...
##   1613 genes were dropped because they have exons located on both strands
##   of the same reference sequence or on more than one reference sequence,
##   so cannot be represented by a single genomic range.
##   Use 'single.strand.genes.only=FALSE' to get all the genes in a
##   GRangesList object, or use suppressMessages() to suppress this message.
## Overlapping while taking distance to nearest TSS into account...
## Finding hierachical overlaps...
## ### Overlap Summary: ###
## Features overlapping genes: 67.93 %
## Number of unique genes: 16163

DEGs

As a sanity check we can do differential gene expression analysis using the aggregated count matrix (a pair of K27M-KO samples vs two K27M). And indeed we find ample DEGS, with IGF2 being a known upregulation target of K27M.

symbols <- mapIds(org.Hs.eg.db,
                  keys = rowData(tcs2)$geneID,
                  keytype = "ENTREZID",
                  column = "SYMBOL")
rowData(tcs2)$symbol <- as.character(symbols)
genelevel <- quantifyGenes(tcs2, 
                           genes = "geneID", 
                           inputAssay = "counts")
m <- assay(genelevel)
cd <- tibble(samp = colnames(m)) %>% 
  mutate(cond = ifelse(grepl('^cont', samp), 'ctrl', 'uc')) %>%
  column_to_rownames('samp')
dds <- DESeqDataSetFromMatrix(m, cd, ~cond) %>% DESeq()
res <- results(dds)
resLFC <- lfcShrink(dds, 2, type = 'apeglm')

volc <- function(r, x, y, ttl) {
  d <- as.data.frame(r) %>%
    dplyr::rename(x = !!x, y = !!y) %>%
    mutate(kind = case_when(abs(x) > 2 & y < .01 ~ 'DE',
                            abs(x) > 2 ~ 'big',
                            y < .01 ~ 'sig',
                            T ~ 'NS'),
           y = -log10(y))
  ct <- na.omit(d) %>% 
    dplyr::filter(kind == 'DE') %>%
    mutate(up = x > 0) %>%
    dplyr::count(up) %>%
    mutate(x = ifelse(up, Inf, -Inf),
           y = Inf,
           h = as.numeric(up))
  ggplot(d, aes(x, y, color = kind)) +
    geom_vline(xintercept = c(-2, 2), linetype = 'dashed') +
    geom_hline(yintercept = 2, linetype = 'dashed') +
    geom_point(alpha = .5) +
    geom_label(aes(x = x, y = y, label = n, hjust = h),
              vjust = 1, data = ct, inherit.aes = F) +
    scale_y_continuous() +
    scale_color_manual(values = c('forestgreen', 'red2', 'grey30',  'royalblue')) +
    labs(x = x, y = y, title = ttl)
}
volc(resLFC, 'log2FoldChange', 'padj', 'Gene-level')


Differential TSS usage

Filter

With CAGE we’d like to look the expression from individual TSS clusters: here we’ll just consider those that were successfully annotated to genes and constitute >10% of their cognate gene’s total expression level in >1 sample.

tc <- subset(tcs2, !is.na(geneID)) %>%
  calcComposition(outputColumn = "composition",
                  unexpressed = 0.1,
                  genes = "geneID")
tibble(x = rowRanges(tc)$composition) %>%
  dplyr::count(x) %>%
  ggplot(aes(x = x, y = n)) +
  geom_vline(xintercept = 1.5, color = 'red') +
  geom_col(aes(fill = x), show.legend = F) +
  scale_y_log10(expand = expansion(c(0,.1)),
                breaks = 10^(1:4),
                labels = parse(text = sprintf('10^{%d}', 1:4))) +
  labs(x = "# of samples with >10% contribution", y = "Frequency") 

Differential analysis

Now we proceed like we’ve done for genes.

m <- assay(tc)
dds <- DESeqDataSetFromMatrix(m, cd, ~cond) %>% DESeq()
res2 <- results(dds)
resLFC2 <- lfcShrink(dds, 2, type = 'apeglm')
volc(resLFC2, 'log2FoldChange', 'padj', 'Cluster-level')


Promoter switching

Having done both gene-level and TSS-level differential expression analysis, the results can be combined to investigate genes that experience DTU but not DE.

tc.data <- rowData(tc) %>% as.data.frame() %>% rownames_to_column('idx')
ns <- na.omit(res) %>% as.data.frame() %>% dplyr::filter(pvalue > .05) %>% rownames()
sres <- resLFC2 %>% 
  as.data.frame() %>%
  rownames_to_column('idx') %>% 
  dplyr::filter(padj < .05 & abs(log2FoldChange) > 1) %>%
  merge(tc.data) %>%
  dplyr::filter(geneID %in% ns)

`

  1. How many of such genes are there?

`

We can again examine the pathways affected

g <- gost(sres$symbol, 'hsapiens')
gostplot(g)

Correlating enhancer-promoter pairs

As we have lists of both potential enhancer and TSS elements across 54 samples, we could try finding those that are significaintly correlated in terms of their expression – putative cis-regulatory pairings.

rowRanges(tcs2)$clusterType <- "TSS"
rowRanges(enhs2)$clusterType <- "enhancer"
rse <- combineClusters(tcs2, enhs2, removeIfOverlapping = "object1")
rowData(rse)$clusterType <- factor(rowData(rse)$clusterType, c('TSS', 'enhancer'))
lnks <- findLinks(rse,
                  inputAssay = 'TPM',
                  maxDist = 1e4L,
                  directional = 'clusterType',
                  method = 'kendall')

d <- rowRanges(rse) %>%
  {.[overlapsAny(., lnks[order(lnks$p.value)[1]])]} %>%
  names() %>%
  assay(rse)[.,] %>%
  as.data.frame() %>% 
  rownames_to_column('idx') %>%
  pivot_longer(-idx, names_to = 'samp', values_to = 'v') %>%
  pivot_wider(names_from = 'idx', values_from = 'v') %>%
  mutate(cond = ifelse(grepl('^cont', samp), 'ctrl', 'uc'))

d %>%
  dplyr::rename(x = 2, y = 3) %>%
  ggplot(aes(x, y, color = cond)) +
  geom_point() +
  annotate(geom = 'label', x = -Inf, y = Inf, hjust = 0, vjust = 1,
           label = sprintf('\u3c4 = %.2f', cor(d[[2]], d[[3]], method = 'kendall'))) +
  labs(x = names(d)[2], y = names(d)[3])